A leading car trading company which connects sellers and buyers through an online trading platform wants to analyze its data to improve its business.
The objectives are to:
# Importing libraries
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from fancyimpute import KNN
import itertools
import graphviz
import shap
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/graphviz-2.38/release/bin/'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
# Show every row/column and full cell contents when inspecting frames.
pd.set_option('display.max_columns', None) # or 1000
pd.set_option('display.max_rows', None) # or 1000
# None (not -1) is the supported "no truncation" value; -1 is deprecated
# and rejected by pandas >= 1.0.
pd.set_option('display.max_colwidth', None) # or 199
# Reading the data
# Load the raw listings; '?' placeholder values are cleaned up below.
car_Data = pd.read_csv('Auto1-DS-TestData.csv')
# Display the full frame (display options above show all rows/columns).
car_Data
# Several numeric columns arrive as object dtype because of '?' markers.
car_Data.dtypes
# Summary statistics for numeric and categorical columns alike.
car_Data.describe(include = 'all')
make: alfa-romero, audi, bmw, chevrolet, dodge, honda, isuzu, jaguar, mazda, mercedes-benz, mercury, mitsubishi, nissan, peugot, plymouth, porsche, renault, saab, subaru, toyota, volkswagen, volvo
fuel-type: diesel, gas.
Cleansing Data :
# Replacing '?' with nulls#
# The raw file marks missing values with '?'; convert them to real NaNs so
# pandas treats them as missing. np.nan (not the np.NaN alias, which was
# removed in NumPy 2.0) keeps this forward-compatible.
car_Data = car_Data.replace('?', np.nan)
# Checking for nulls
car_Data.isnull().sum()
# price is the prediction target, so rows without it are unusable.
car_Data = car_Data.dropna(subset=['price'])
The msno.matrix nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.
msno.matrix(car_Data)
From the above plot, we can see that the column normalized-losses has the most missing values.
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another:
msno.heatmap(car_Data)
There seems to be high correlation of missing values between stroke & bore and horsepower & peak-rpm.
Need to convert the following columns to their appropriate data types:
# symboling is treated as categorical here rather than as a number.
car_Data['symboling'] = car_Data['symboling'].astype('object')
# These columns were read as object because of the '?' markers; now that
# those are NaN they can hold proper floats. Bulk-convert in one call.
numeric_cols = ['normalized-losses', 'bore', 'stroke', 'peak-rpm', 'price', 'horsepower']
car_Data[numeric_cols] = car_Data[numeric_cols].astype('float')
car_Data.dtypes
# One univariate plot per column: histogram + rug for numeric columns,
# bar counts for everything else.
for col in car_Data.columns:
    series = car_Data[col]
    if str(series.dtype) in ('int64', 'float64'):
        sns.distplot(series.dropna(), kde=False, rug=True)
    else:
        sns.countplot(series, palette="deep")
        plt.xticks(rotation=90)
    plt.show()
Analysis from the above plots :
Numerical VS Numerical Features
# Pairwise scatter plots between all numeric features (rows with any NaN
# dropped so every panel has complete data).
sns.pairplot(car_Data.dropna())
plt.figure(figsize=(16,16))
# Select numeric columns explicitly: pandas >= 2.0 raises a TypeError from
# DataFrame.corr() when object columns are present, while older versions
# silently dropped them -- select_dtypes gives the same result on both.
sns.heatmap(car_Data.select_dtypes(include=[np.number]).corr())
plt.show()
Categorical VS Categorical Features
# Cross-tabulate every pair of categorical columns as grouped count plots.
cat_Cols = car_Data.select_dtypes(include=['object']).columns
for first, second in itertools.combinations(cat_Cols, 2):
    sns.countplot(x=first, hue=second, data=car_Data)
    plt.xticks(rotation=90)
    plt.show()
Categorical VS Numerical
# Numeric features broken down by the symboling risk rating.
for column in car_Data.select_dtypes(include=['int64','float64']).columns:
    sns.boxplot(x = 'symboling', y= column, data = car_Data)
    plt.show()
# Price distribution within each level of every categorical feature.
for column in car_Data.select_dtypes(include=['object']).columns:
    sns.boxplot(x = column, y= 'price', data = car_Data)
    plt.show()
# Same breakdown for the insurance normalized-losses column.
for column in car_Data.select_dtypes(include=['object']).columns:
    sns.boxplot(x = column, y= 'normalized-losses', data = car_Data)
    plt.show()
Preprocessing to be done :
# Convert categorical variables to numeric variables
cols_to_transform = car_Data.select_dtypes(include=['object']).columns
# One-hot encode; drop_first=True drops one level per feature to avoid
# perfect collinearity between the resulting dummy columns.
car_Data = pd.get_dummies(columns=cols_to_transform, data=car_Data, prefix=cols_to_transform, prefix_sep='_', drop_first=True)
cols = car_Data.columns
print(cols)
car_Data.head()
# KNN(3): fill each remaining NaN from the 3 nearest rows (fancyimpute).
# fit_transform returns a bare ndarray, so column names are restored below.
car_Data_Imputed = pd.DataFrame(KNN(3).fit_transform(car_Data))
car_Data_Imputed.columns = cols
# Checking for nulls
car_Data_Imputed.isnull().sum()
cols_to_transform
# Scaling the features
# The dummy (0/1) columns are exactly those produced by get_dummies above:
# their names are "<original column>_<level>". Deriving the split from
# cols_to_transform replaces two error-prone hard-coded column lists that
# had to be kept in sync by hand.
cat_Cols = [c for c in cols if any(c.startswith(p + '_') for p in cols_to_transform)]
num_Cols = [c for c in cols if c not in set(cat_Cols)]
scaler = StandardScaler()
# Keep the unscaled target aside; it is restored after scaling so models
# predict price in its original units.
price = car_Data_Imputed.price
car_Num_Data = car_Data_Imputed[num_Cols]
car_Cat_Data = car_Data_Imputed[cat_Cols]
# Standardize the continuous columns only; dummy columns stay 0/1.
car_Data_Scaled = pd.DataFrame(scaler.fit_transform(car_Num_Data))
car_Data_Scaled.columns = num_Cols
car_Data_Scaled = pd.concat([car_Data_Scaled, car_Cat_Data], axis=1)
car_Data_Scaled['price'] = price
car_Data_Scaled.head()
Split the data into Train and Test
# Features are every column except the target; hold out 30% for testing.
feature_mask = car_Data_Scaled.columns != 'price'
X = car_Data_Scaled.loc[:, feature_mask].values
y = car_Data_Scaled.loc[:, 'price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
Starting with a simple model !!!
# Baseline: ordinary least-squares linear regression on the scaled features.
# (The original comment mentioned an RFE/SVM model; no feature elimination
# happens here -- this is a plain least-squares fit.)
linReg = LinearRegression()
linear_Model = linReg.fit(X_train, y_train)
# Predictions on both splits feed the metric printouts below.
y_train_pred = linear_Model.predict(X_train)
y_test_pred = linear_Model.predict(X_test)
$$MAE = \dfrac{1}{n}\sum_{i = 1}^{n}\mid y_{i} - \hat{y_{i}}\mid$$
$$MSE = \dfrac{1}{n}\sum_{i = 1}^{n}(y_{i} - \hat{y_{i}})^2$$
$$RMSE = \sqrt{\dfrac{1}{n}\sum_{i = 1}^{n}(y_{i} - \hat{y_{i}})^2}$$
$$MAPE = \dfrac{100}{n}\sum_{i = 1}^{n}\mid\dfrac{y_{i} - \hat{y_{i}}}{y_{i}}\mid$$
# Mean absolute error in price units for the linear baseline.
print('The Mean absolute error on train data: {} \n'.format(mean_absolute_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute error on test data: {} \n'.format(mean_absolute_error(y_pred = y_test_pred, y_true = y_test)))
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both arguments are converted to numpy arrays, so any array-like of
    equal length is accepted. Note: undefined when y_true contains zeros.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return 100 * relative_errors.mean()
# MAPE expresses the error as a percentage of the true price; R^2 measures
# explained variance. A large train/test gap would signal overfitting.
print('The Mean absolute percentage error on train data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute percentage error on test data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_test_pred, y_true = y_test)))
print('The R2 Score on train data: {} \n'.format(r2_score(y_pred = y_train_pred, y_true = y_train)))
print('The R2 Score on test data: {} \n'.format(r2_score(y_pred = y_test_pred, y_true = y_test)))
max_depth : int or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
# set of parameters to test
# Tune only tree depth; 10-fold CV guards against overfitting on this
# small dataset.
param_grid = {'max_depth': range(1,11)}
dt = tree.DecisionTreeRegressor()
GS = GridSearchCV(dt, param_grid, cv=10)
GS.fit(X_train, y_train)
GS.best_params_
importances = GS.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
# Map importances back to the columns the model was actually fit on.
# X excluded 'price', so index into the price-less column list rather than
# car_Data_Scaled.columns, which only lined up because 'price' happens to
# be the last column.
feature_cols = car_Data_Scaled.columns.drop('price')
pd.DataFrame([feature_cols[indices], np.sort(importances)[::-1]])
# class_names is only meaningful for classifiers, so it is omitted for
# this regression tree.
dot_data = tree.export_graphviz(GS.best_estimator_, out_file=None, feature_names=feature_cols, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph
# Predictions from the best decision tree found by the grid search.
y_train_pred = GS.predict(X_train)
y_test_pred = GS.predict(X_test)
print('The Mean absolute error on train data: {} \n'.format(mean_absolute_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute error on test data: {} \n'.format(mean_absolute_error(y_pred = y_test_pred, y_true = y_test)))
# NOTE(review): identical to the definition earlier in the script; the
# redefinition is redundant but harmless.
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
print('The Mean absolute percentage error on train data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute percentage error on test data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_test_pred, y_true = y_test)))
print('The R2 Score on train data: {} \n'.format(r2_score(y_pred = y_train_pred, y_true = y_train)))
print('The R2 Score on test data: {} \n'.format(r2_score(y_pred = y_test_pred, y_true = y_test)))
In my experience as a data scientist, tree-based ensemble models such as Random Forest and XGBoost perform better in most situations; they stay manageable through regularization and can be fast in production with GPU support.
n_estimators : integer, optional (default=10).
The number of trees in the forest.
max_depth : integer or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
max_features : int, float, string or None, optional (default=”auto”)
The number of features to consider when looking for the best split.
min_samples_leaf : int, float, optional (default=1)
The minimum number of samples required to be at a leaf node
# set of parameters to test
# Grid over forest size, tree depth, features-per-split and leaf size;
# 10-fold CV picks the combination with the best cross-validated score.
param_grid = {
    "n_estimators" : [250, 300],
    "max_depth" : [1,5,10],
    "max_features" : [3, 5],
    "min_samples_leaf" : [1, 2, 4, 6, 8, 10]}
rf = RandomForestRegressor()
GS = GridSearchCV(rf, param_grid, cv=10)
GS.fit(X_train, y_train)
GS.best_params_
# Predictions from the best random forest found by the grid search.
y_train_pred = GS.predict(X_train)
y_test_pred = GS.predict(X_test)
print('The Mean absolute error on train data: {} \n'.format(mean_absolute_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute error on test data: {} \n'.format(mean_absolute_error(y_pred = y_test_pred, y_true = y_test)))
# NOTE(review): identical to the definition earlier in the script; the
# redefinition is redundant but harmless.
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
print('The Mean absolute percentage error on train data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute percentage error on test data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_test_pred, y_true = y_test)))
print('The R2 Score on train data: {} \n'.format(r2_score(y_pred = y_train_pred, y_true = y_train)))
print('The R2 Score on test data: {} \n'.format(r2_score(y_pred = y_test_pred, y_true = y_test)))
def runXGB(train_X, train_y, test_X, test_y=None):
    """Train an xgboost regressor on (train_X, train_y) and return
    (predictions for test_X, fitted booster).

    test_y is accepted for signature compatibility but unused. Relies on
    the module-level `cols` index for feature names.
    """
    params = {}
    # 'reg:squarederror' is the documented replacement for the deprecated
    # 'reg:linear' alias; both optimize squared error.
    params["objective"] = "reg:squarederror"
    params["eta"] = 0.002            # small learning rate, offset by many rounds
    params["min_child_weight"] = 1
    params["subsample"] = 0.9        # row subsampling per tree
    params["colsample_bytree"] = 0.8  # column subsampling per tree
    params["verbosity"] = 0          # replaces the deprecated 'silent' flag
    params["max_depth"] = 8
    params["seed"] = 1               # fixed seed -> deterministic training
    plst = list(params.items())
    num_rounds = 2500
    # Compute the price-less feature-name list once for both matrices so
    # importance plots and SHAP output are labelled.
    feature_names = cols[cols != 'price']
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)
    xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
    model = xgb.train(plst, xgtrain, num_rounds)
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
# Train once and reuse the fitted booster for both splits. The original
# called runXGB twice, retraining an identical 2500-round model (same data
# and fixed seed) just to score the training set.
y_test_pred, model = runXGB(np.array(X_train), y_train, np.array(X_test))
y_train_pred = model.predict(xgb.DMatrix(np.array(X_train), feature_names=cols[cols != 'price']))
print('The Mean absolute error on train data: {} \n'.format(mean_absolute_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute error on test data: {} \n'.format(mean_absolute_error(y_pred = y_test_pred, y_true = y_test)))
# mean_absolute_percentage_error is already defined earlier in the script;
# the redundant redefinition was removed.
print('The Mean absolute percentage error on train data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute percentage error on test data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_test_pred, y_true = y_test)))
print('The R2 Score on train data: {} \n'.format(r2_score(y_pred = y_train_pred, y_true = y_train)))
print('The R2 Score on test data: {} \n'.format(r2_score(y_pred = y_test_pred, y_true = y_test)))
# Feature importance for the fitted booster (top 50 features).
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()
# Render a boosted tree as a graphviz diagram.
xgb.to_graphviz(model,)
SHAP (SHapley Additive exPlanations) is a unified approach to explain the output of any machine learning model. SHAP connects game theory with local explanations, uniting several previous methods [1-7] and representing the only possible consistent and locally accurate additive feature attribution method based on expectations (see the SHAP NIPS paper for details). Link : https://github.com/slundberg/shap
# load JS visualization code to notebook
shap.initjs()
# TreeExplainer computes SHAP values for tree-ensemble models.
explainer = shap.TreeExplainer(model)
# Wrap X_train in a DataFrame so SHAP output is labelled with feature names.
shap_values = explainer.shap_values(pd.DataFrame(X_train, columns=cols[cols!='price']))
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], pd.DataFrame(X_train, columns=cols[cols!='price']).iloc[0,:])
The above explanation shows how each feature contributes to pushing the model output away from the base value (the average model output over the training dataset we passed) toward the final prediction. Features pushing the prediction higher are shown in red; those pushing it lower are in blue.
shap.summary_plot(shap_values, pd.DataFrame(X_train, columns=cols[cols!='price']))
To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The plot below sorts features by the sum of SHAP value magnitudes over all samples, and uses SHAP values to show the distribution of the impacts each feature has on the model output. The color represents the feature value (red high, blue low). This reveals for example that a high curb-weight increases the predicted car price.
sns.scatterplot(car_Data['engine-size'], car_Data['curb-weight'], hue=car_Data['price'])
Using the above plots, we can infer that the price of a car can be predicted just by using the curb-weight and engine-size with less error.
We can remove insurance-related features like symboling and normalized-losses when predicting the price of a car. In turn, we can use price, symboling and normalized-losses to decide whether to buy a car from a seller.
Split the data into Train and Test
# Second task: predict normalized-losses. Exclude the target itself, price,
# and the symboling dummies (all insurance-related outputs) from the features.
X, y = car_Data_Scaled.loc[:, [x for x in car_Data_Scaled.columns if x not in ['normalized-losses','price','symboling_-1','symboling_0', 'symboling_1', 'symboling_2','symboling_3']]].values, car_Data_Scaled.loc[:,'normalized-losses'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
def runXGB(train_X, train_y, test_X, test_y=None):
    """Train an xgboost regressor for the normalized-losses target and
    return (predictions for test_X, fitted booster).

    test_y is accepted for signature compatibility but unused. Relies on
    the module-level car_Data_Scaled for feature names.
    """
    params = {}
    # 'reg:squarederror' is the documented replacement for the deprecated
    # 'reg:linear' alias; both optimize squared error.
    params["objective"] = "reg:squarederror"
    params["eta"] = 0.002            # small learning rate, offset by many rounds
    params["min_child_weight"] = 1
    params["subsample"] = 0.9
    params["colsample_bytree"] = 0.8
    params["verbosity"] = 0          # replaces the deprecated 'silent' flag
    params["max_depth"] = 8
    params["seed"] = 1               # fixed seed -> deterministic training
    plst = list(params.items())
    num_rounds = 2500
    # Compute the feature-name list once instead of duplicating the
    # exclusion list for the train and test matrices.
    excluded = {'normalized-losses', 'price', 'symboling_-1', 'symboling_0', 'symboling_1', 'symboling_2', 'symboling_3'}
    feature_names = [c for c in car_Data_Scaled.columns if c not in excluded]
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)
    xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
    model = xgb.train(plst, xgtrain, num_rounds)
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
# Train once and reuse the fitted booster for both splits. The original
# retrained an identical model (same data and fixed seed) just to score
# the training set.
y_test_pred, model = runXGB(np.array(X_train), y_train, np.array(X_test))
feat_names = [x for x in car_Data_Scaled.columns if x not in ['normalized-losses','price','symboling_-1','symboling_0', 'symboling_1', 'symboling_2','symboling_3']]
y_train_pred = model.predict(xgb.DMatrix(np.array(X_train), feature_names=feat_names))
print('The Mean absolute error on train data: {} \n'.format(mean_absolute_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute error on test data: {} \n'.format(mean_absolute_error(y_pred = y_test_pred, y_true = y_test)))
# mean_absolute_percentage_error is already defined earlier in the script;
# the redundant redefinition was removed.
print('The Mean absolute percentage error on train data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_train_pred, y_true = y_train)))
print('The Mean absolute percentage error on test data: {} \n'.format(mean_absolute_percentage_error(y_pred = y_test_pred, y_true = y_test)))
print('The R2 Score on train data: {} \n'.format(r2_score(y_pred = y_train_pred, y_true = y_train)))
print('The R2 Score on test data: {} \n'.format(r2_score(y_pred = y_test_pred, y_true = y_test)))
# Feature importance for the normalized-losses booster (top 50 features).
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()
# Render a boosted tree as a graphviz diagram.
xgb.to_graphviz(model)
# load JS visualization code to notebook
shap.initjs()
explainer = shap.TreeExplainer(model)
# Build the labelled training frame once; the same exclusion-list
# comprehension was previously repeated inline four times.
shap_cols = [x for x in car_Data_Scaled.columns if x not in ['normalized-losses','price','symboling_-1','symboling_0', 'symboling_1', 'symboling_2','symboling_3']]
X_train_df = pd.DataFrame(X_train, columns=shap_cols)
shap_values = explainer.shap_values(X_train_df)
# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train_df.iloc[0,:])
# Global importance summary for the normalized-losses model.
shap.summary_plot(shap_values, X_train_df)
This reveals that greater vehicle height is associated with higher normalized-losses.